For the visualization of homocedastic and hetercedastic variance, we simulate:
\[ y \sim N \left ( -1 + 2x, 1 \right ) \]
The funnel-shaped heterocedasticy is simulated using:
\[ y \sim N \left ( -1 + 2x, \left ( 0.1 + 0.3 \left ( x + 3 \right ) \right )^{2} \right ) \]
par(mfrow = c(2, 2))
# Homecedastic variance
X <- seq(-2, 2, length.out = 300)
homecedastic_random <- function (x){
rnorm(1, mean = -1 + 2 * x, sd = 1)[1]
}
homo_y <- sapply(X, homecedastic_random)
homo_e <- (-1 + 2 * X) - homo_y
plot(X, homo_y, xlab='', ylab='', main='homocedastic variance')
abline(-1, 2, col = "red", lwd=2)
plot(X, homo_e, xlab='', ylab='', main='homocedastic variance errors')
abline(0, 0, col = "red", lwd=2)
heterocedastic_random <- function (x){
rnorm(1, mean = -1 + 2 * x, sd = 0.1 + 0.3 * (x + 3))[1]
}
hetero_y <- sapply(X, heterocedastic_random)
hetero_e <- (-1 + 2 * X) - hetero_y
plot(X, hetero_y, xlab='', ylab='', main='funnel-shaped heterocedastic variance')
abline(-1, 2, col = "red", lwd=2)
plot(X, hetero_e, xlab='', ylab='', main='funnel-shaped heterocedastic variance errors')
abline(0, 0, col = "red", lwd=2)
par(mfrow = c(1, 2))
munich_rent_url <- "https://www.uni-goettingen.de/de/document/download/64c29c1b1fccb142cfa8f29a942a9e05.raw/rent99.raw"
munich_rent_index <- read.table(
url(munich_rent_url),
header = 1,
colClasses = c(
"numeric", "numeric", "numeric",
"numeric", "factor", "factor",
"factor", "factor", "factor"
)
)
simple_reg <- lm(rent ~ area, data = munich_rent_index)
plot(munich_rent_index$area, munich_rent_index$rent, xlab = 'area in sqm', ylab = 'net rent in Euro')
abline(simple_reg, col = "red", lwd = 2)
predicted <- predict(simple_reg)
plot(munich_rent_index$area, predicted - munich_rent_index$rent, xlab = 'area in sqm', ylab = 'residuals')
abline(0, 0, col = "red", lwd = 2)
We simulate data using:
\[ y_{i} = \exp(1 + x_{i1} - x_{i2} + \epsilon_{i}) \]
With: \[ \epsilon_{i} \sim N \left ( 0, {0.4}^{2} \right ) \]
We plot this model:
par(mfrow = c(2, 2))
n <- 50
x1 <- seq(0, 3, length.out = n)
x2 <- seq(0, 3, length.out = n)
x_grid <- expand.grid(x1 = x1, x2 = x2)
epsilon <- rnorm(n^2, mean = 0, sd = 0.4)
y <- exp(1 + x_grid$x1 - x_grid$x2 + epsilon)
plot(
x_grid$x1, y,
main = 'scatter plot: y versus x1',
xlab = 'x1',
ylab = 'y'
)
plot(
x_grid$x2, y,
main = 'scatter plot: y versus x2',
xlab = 'x2',
ylab = 'y'
)
plot(
x_grid$x1, log(y),
main = 'scatter plot: log(y) versus x1',
xlab = 'x1',
ylab = 'log(y)'
)
plot(
x_grid$x2, log(y),
main = 'scatter plot: log(y) versus x2',
xlab = 'x2',
ylab = 'log(y)'
)
Data not available online.
Importing data using script:
source("import_data/munich_rent_index.R")
We do the regression model:
\[ \text{rentsqm}_{i} = \beta_{0} + \beta_{1} \cdot f(\text{area}_{i}) + \epsilon_{i} \]
With both \(f(\cdot)\):
\[ f(\text{area}_{i}) = \text{area}_{i} \]
For linear model and:
\[ f(\text{area}_{i}) = \frac{1}{\text{area}_{i}} \]
For future use we define those models:
Model 1 (\(M1\)):
\[ \mathbb{E}(\text{rentqsm}) = \beta_{0} + \beta_{1} \text{area} \]
Model 2 (\(M2\)):
\[ \mathbb{E}(\text{rentqsm}) = \beta_{0} + \beta_{1} \frac{1}{\text{area}} \]
For nonlinear relationship between \(\text{rentsqm}\) and \(\text{area}\). The plotted graph below are those two models with the left panels the linear regression without the transformation and the right panels with the inverse transformation.
par(mfrow = c(3, 2))
m1 <- lm(rentsqm ~ area, data = munich_rent_index)
summary(m1)
Call:
lm(formula = rentsqm ~ area, data = munich_rent_index)
Residuals:
Min 1Q Median 3Q Max
-6.9622 -1.5737 -0.1102 1.5861 9.9674
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 9.46883 0.12426 76.20 <2e-16 ***
area -0.03499 0.00174 -20.11 <2e-16 ***
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Residual standard error: 2.291 on 3080 degrees of freedom
Multiple R-squared: 0.1161, Adjusted R-squared: 0.1158
F-statistic: 404.5 on 1 and 3080 DF, p-value: < 2.2e-16
m2 <- lm(rentsqm ~ I(1 / area), data = munich_rent_index)
summary(m2)
Call:
lm(formula = rentsqm ~ I(1/area), data = munich_rent_index)
Residuals:
Min 1Q Median 3Q Max
-7.3963 -1.5733 -0.0921 1.5824 10.1287
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 4.7321 0.1084 43.65 <2e-16 ***
I(1/area) 140.1783 5.9287 23.64 <2e-16 ***
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Residual standard error: 2.241 on 3080 degrees of freedom
Multiple R-squared: 0.1536, Adjusted R-squared: 0.1533
F-statistic: 559 on 1 and 3080 DF, p-value: < 2.2e-16
plot_rentsqm_area <- function(title) {
plot(
munich_rent_index$area,
munich_rent_index$rentsqm,
main = title,
xlab = 'area in sqm',
ylab = 'rent per sqm'
)
}
seq_area <- seq(min(munich_rent_index$area), max(munich_rent_index$area), length.out = 100)
plot_rentsqm_area('rent per sqm vs. area')
m1_beta <- m1$coefficients
lines(seq_area, m1_beta[1] + m1_beta[2] * seq_area, col = 'red', lwd = 2)
plot_rentsqm_area('f(area) = 1/area')
m2_beta <- m2$coefficients
lines(seq_area, m2_beta[1] + m2_beta[2] * 1 / seq_area, col = 'red', lwd = 2)
plot_residuals <- function(model) {
plot(
munich_rent_index$area,
model$residuals,
main = 'residuals',
xlab = 'area in sqm',
ylab = 'residuals'
)
abline(0, 0, col = 'red', lwd = 2)
}
plot_residuals(m1)
plot_residuals(m2)
plot_av_residuals <- function(model) {
av_residuals <- tapply(
model$residuals, munich_rent_index$area,
mean
)
plot(
unique(munich_rent_index$area),
av_residuals,
main = 'average residuals',
xlab = 'area in sqm',
ylab = 'residuals'
)
abline(0, 0, col = 'red', lwd = 2)
}
plot_av_residuals(m1)
plot_av_residuals(m2)
For polynomial regressions we adjust two different model. Of second-order (Model 3 (\(M3\))):
\[ \mathbb{E}(\text{rentqsm}) = \beta_{0} + \beta_{1} \text{area} + \beta_{2} \text{area}^{2} \]
And third-order (Model 4 (\(M4\))):
\[ \mathbb{E}(\text{rentqsm}) = \beta_{0} + \beta_{1} \text{area} + \beta_{2} \text{area}^{2} + \beta_{3} \text{area}^{3} \]
par(mfrow = c(2, 2))
m3 <- lm(rentsqm ~ area + I(area^2), data = munich_rent_index)
summary(m3)
Call:
lm(formula = rentsqm ~ area + I(area^2), data = munich_rent_index)
Residuals:
Min 1Q Median 3Q Max
-7.2150 -1.5816 -0.0915 1.5869 9.9392
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 1.183e+01 2.684e-01 44.081 <2e-16 ***
area -1.057e-01 7.351e-03 -14.380 <2e-16 ***
I(area^2) 4.707e-04 4.758e-05 9.892 <2e-16 ***
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Residual standard error: 2.255 on 3079 degrees of freedom
Multiple R-squared: 0.1433, Adjusted R-squared: 0.1428
F-statistic: 257.6 on 2 and 3079 DF, p-value: < 2.2e-16
m4 <- lm(rentsqm ~ area + I(area^2) + I(area^3), data = munich_rent_index)
summary(m4)
Call:
lm(formula = rentsqm ~ area + I(area^2) + I(area^3), data = munich_rent_index)
Residuals:
Min 1Q Median 3Q Max
-7.4269 -1.5854 -0.1101 1.5948 10.0670
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 1.428e+01 5.730e-01 24.915 < 2e-16 ***
area -2.175e-01 2.432e-02 -8.946 < 2e-16 ***
I(area^2) 1.981e-03 3.167e-04 6.254 4.54e-10 ***
I(area^3) -6.105e-06 1.266e-06 -4.823 1.49e-06 ***
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Residual standard error: 2.247 on 3078 degrees of freedom
Multiple R-squared: 0.1497, Adjusted R-squared: 0.1489
F-statistic: 180.7 on 3 and 3078 DF, p-value: < 2.2e-16
plot_rentsqm_area('rent per sqm vs. area')
m3_beta <- m3$coefficients
lines(seq_area, m3_beta[1] + m3_beta[2] * seq_area + m3_beta[3] * seq_area ^ 2, col = 'red', lwd = 2)
plot_rentsqm_area('f(area) = 1/area')
m4_beta <- m4$coefficients
lines(seq_area, m4_beta[1] + m4_beta[2] * seq_area + m4_beta[3] * seq_area ^ 2 + m4_beta[4] * seq_area ^ 3, col = 'red', lwd = 2)
plot_residuals(m3)
plot_residuals(m4)
We define the following aditive models. Additive model 1:
\[ \mathcal{E}(\text{rentsqm}_{i}) = \beta_{0} + \beta_{1} \cdot \frac{1}{\text{area}_{i}} + \beta_{2} \cdot \text{yearc}_{i} \]
And a second one with \(\text{yearc}\) as a polinomial of degree 3:
\[ \mathcal{E}(\text{rentsqm}_{i}) = \beta_{0} + \beta_{1} \cdot \frac{1}{\text{area}_{i}} + \beta_{2} \cdot \text{yearc}_{i} + \beta_{3} \cdot {\text{yearc}_{i}}^{2} + \beta_{4} \cdot {\text{yearc}_{i}}^{3} \]
additive_m1 <- lm(rentsqm ~ I(1/area) + yearc, data = munich_rent_index)
summary(additive_m1)
Call:
lm(formula = rentsqm ~ I(1/area) + yearc, data = munich_rent_index)
Residuals:
Min 1Q Median 3Q Max
-7.2682 -1.4894 -0.1401 1.3935 9.0582
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) -65.406008 3.351088 -19.52 <2e-16 ***
I(1/area) 119.360735 5.636182 21.18 <2e-16 ***
yearc 0.036033 0.001721 20.94 <2e-16 ***
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Residual standard error: 2.097 on 3079 degrees of freedom
Multiple R-squared: 0.2591, Adjusted R-squared: 0.2586
F-statistic: 538.5 on 2 and 3079 DF, p-value: < 2.2e-16
additive_m2 <- lm(rentsqm ~ I(1/area) + yearc + I(yearc ^ 2)+ I(yearc ^ 3), data = munich_rent_index)
summary(additive_m2)
Call:
lm(formula = rentsqm ~ I(1/area) + yearc + I(yearc^2) + I(yearc^3),
data = munich_rent_index)
Residuals:
Min 1Q Median 3Q Max
-6.9920 -1.3705 -0.1354 1.3711 8.2834
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 2.942e+04 2.993e+04 0.983 0.326
I(1/area) 1.296e+02 5.536e+00 23.406 <2e-16 ***
yearc -4.330e+01 4.592e+01 -0.943 0.346
I(yearc^2) 2.119e-02 2.348e-02 0.902 0.367
I(yearc^3) -3.445e-06 4.002e-06 -0.861 0.389
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Residual standard error: 2.039 on 3077 degrees of freedom
Multiple R-squared: 0.3, Adjusted R-squared: 0.2991
F-statistic: 329.7 on 4 and 3077 DF, p-value: < 2.2e-16
A third model is specified using the truncated year of construction (\(\text{yearc} - 1900\)):
munich_rent_index$yearco <- munich_rent_index$yearc - 1900
additive_m3 <- lm(rentsqm ~ I(1/area) + yearco + I(yearco ^ 2)+ I(yearco ^ 3), data = munich_rent_index)
summary(additive_m3)
Call:
lm(formula = rentsqm ~ I(1/area) + yearco + I(yearco^2) + I(yearco^3),
data = munich_rent_index)
Residuals:
Min 1Q Median 3Q Max
-6.9920 -1.3705 -0.1354 1.3711 8.2834
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 5.417e+00 4.779e-01 11.335 < 2e-16 ***
I(1/area) 1.296e+02 5.536e+00 23.406 < 2e-16 ***
yearco -9.434e-02 3.384e-02 -2.788 0.00534 **
I(yearco^2) 1.553e-03 6.728e-04 2.308 0.02105 *
I(yearco^3) -3.445e-06 4.002e-06 -0.861 0.38947
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Residual standard error: 2.039 on 3077 degrees of freedom
Multiple R-squared: 0.3, Adjusted R-squared: 0.2991
F-statistic: 329.7 on 4 and 3077 DF, p-value: < 2.2e-16
As in the book, we show the effect of year of construction in the polynomial of degree 3:
par(mfrow = c(1, 2))
beta_add_m2 <- additive_m2$coefficients
yearc_seq <- seq(min(munich_rent_index$yearc), max(munich_rent_index$yearc), length.out = 100)
plot_effect <- function(beta, year_seq, title){
plot(
year_seq,
beta[3] * year_seq + beta[4] * year_seq^2 + beta[5] * year_seq^3,
main = title,
xlab = 'year of construction',
ylab = 'effect of year of construction',
type = 'l',
col = 'darkblue',
lwd = 2
)
}
plot_effect(beta_add_m2, yearc_seq, 'effect of year of construction')
beta_add_m3 <- additive_m3$coefficients
truncate_yearc_seq <- seq(min(munich_rent_index$yearco), max(munich_rent_index$yearco), length.out = 100)
plot_effect(beta_add_m3, truncate_yearc_seq, 'effect of year of construction, coded from 18 to 96')
A new model is defined using the orthogonal polynomial of year of construction:
munich_rent_index$areainv <- (1 / munich_rent_index$area) - mean(1 / munich_rent_index$area)
poly_year <- poly(munich_rent_index$yearco - mean(munich_rent_index$yearco), 3)
munich_rent_index$yearcc <- poly_year[, 1]
munich_rent_index$yearcc2 <- poly_year[, 2]
munich_rent_index$yearcc3 <- poly_year[, 3]
additive_m4 <- lm(rentsqm ~ areainv + yearcc + yearcc2 + yearcc3, data = munich_rent_index)
summary(additive_m4)
Call:
lm(formula = rentsqm ~ areainv + yearcc + yearcc2 + yearcc3,
data = munich_rent_index)
Residuals:
Min 1Q Median 3Q Max
-6.9920 -1.3705 -0.1354 1.3711 8.2834
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 7.11126 0.03674 193.575 <2e-16 ***
areainv 129.57166 5.53582 23.406 <2e-16 ***
yearcc 43.93838 2.07260 21.200 <2e-16 ***
yearcc2 27.53892 2.05958 13.371 <2e-16 ***
yearcc3 -1.75582 2.03998 -0.861 0.389
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Residual standard error: 2.039 on 3077 degrees of freedom
Multiple R-squared: 0.3, Adjusted R-squared: 0.2991
F-statistic: 329.7 on 4 and 3077 DF, p-value: < 2.2e-16
Now we calculate the partial residuals for \(\text{area}\) and \(\text{yearc}\) define by:
\[ \hat{\epsilon}_{\text{area}, i} = \text{rentsqm}_{i} - \hat{\beta_{0}} - \hat{f}_{2}(\text{yearc}_{i}) \]
With the effect \(f(\text{yearc}_{i})\):
\[ \hat{f}_{2}(\text{yearc}) = \beta_{2} \cdot \text{yearc}_{i} + \beta_{3} \cdot {\text{yearc}_{i}}^{2} + \beta_{4} \cdot {\text{yearc}_{i}}^{3} \]
And:
\[ \hat{\epsilon}_{\text{yearc}, i} = \text{rentsqm}_{i} - \hat{\beta_{0}} - \hat{f}_{1}(\text{area}_{i}) \]
With the effect \(f(\text{yearc}_{i})\):
\[ \hat{f}_{1}(\text{yearc}) = \beta_{1} \cdot \frac{1}{\text{area}_{i}} \]
par(mfrow = c(1, 2))
beta_add_m4 <- additive_m4$coefficients
area_seq <- seq(min(munich_rent_index$area), max(munich_rent_index$area), length.out = 100)
area_effect <- function(area){
inv_area <- 1 / area
cent_area <- inv_area - mean(inv_area)
beta_add_m4[2] * cent_area
}
yearc_effect <- function(yearc){
yearc_center <- yearc - mean(yearc)
poly_yearc <- poly(yearc_center, 3)
beta_add_m4[3] * poly_yearc[, 1] + beta_add_m4[4] * poly_yearc[, 2] + beta_add_m4[5] * poly_yearc[, 3]
}
residual_area <- munich_rent_index$rentsqm - beta_add_m4[1] - yearc_effect(munich_rent_index$yearc)
plot(
munich_rent_index$area,
residual_area,
main = 'effect of area',
xlab = 'area in sqm',
ylab = 'effect of area / partial residuals'
)
lines(area_seq, area_effect(area_seq), col = 'red', lwd = 2)
residual_year <- munich_rent_index$rentsqm - beta_add_m4[1] - area_effect(munich_rent_index$area)
plot(
munich_rent_index$yearc,
residual_year,
main = 'effect of year of construction',
xlab = 'year of construction',
ylab = 'effect of year of construction / partial residuals'
)
lines(yearc_seq, yearc_effect(yearc_seq), col = 'red', lwd = 2)
We adjust the regression model with the coding values of location:
\[ \mathcc{E}(\text{rentsqm}_{i}) = \beta_{0} + \beta_{1} \cdot \text{glocation}_{1} + \beta_{2} \cdot \text{tlocation}_{1} \]
munich_rent_index$glocation <- 0
munich_rent_index$glocation[munich_rent_index$location == 2] <- 1
munich_rent_index$glocation[munich_rent_index$location == 1] <- -1
munich_rent_index$tlocation <- 0
munich_rent_index$tlocation[munich_rent_index$location == 3] <- 1
munich_rent_index$tlocation[munich_rent_index$location == 1] <- -1
cod_model <- lm(
rentsqm ~ glocation + tlocation,
data = munich_rent_index
)
summary(cod_model)
Call:
lm(formula = rentsqm ~ glocation + tlocation, data = munich_rent_index)
Residuals:
Min 1Q Median 3Q Max
-6.8564 -1.8376 -0.1074 1.7157 10.4494
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 7.46704 0.09638 77.477 < 2e-16 ***
glocation -0.19479 0.10445 -1.865 0.062285 .
tlocation 0.70529 0.18558 3.800 0.000147 ***
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Residual standard error: 2.426 on 3079 degrees of freedom
Multiple R-squared: 0.008867, Adjusted R-squared: 0.008223
F-statistic: 13.77 on 2 and 3079 DF, p-value: 1.109e-06
We import the new updated model of 2001, available at official page of the dataset as all the datasets:
munich_rent_url_01 <- "https://www.uni-goettingen.de/de/document/download/e00d039e9c1f28ab404b27b0b14931f1.raw/rent98_00.raw"
munich_rent_index_01 <- read.table(
url(munich_rent_url_01),
header = 1,
colClasses = c(
"numeric", "numeric", "numeric",
"numeric", "integer", "integer",
"integer", "integer", "integer",
"integer", "numeric", "numeric",
"numeric"
)
)
summary(munich_rent_index_01)
rent_euro rentsqm_euro area yearc
Min. : 12.83 Min. : 0.1841 Min. : 20.00 Min. :1918
1st Qu.: 320.86 1st Qu.: 5.2601 1st Qu.: 52.00 1st Qu.:1939
Median : 424.12 Median : 6.8614 Median : 66.00 Median :1960
Mean : 456.94 Mean : 7.0274 Mean : 67.92 Mean :1956
3rd Qu.: 552.40 3rd Qu.: 8.7292 3rd Qu.: 82.00 3rd Qu.:1972
Max. :1837.89 Max. :17.6688 Max. :243.00 Max. :1999
glocation tlocation nkitchen pkitchen
Min. :0.0000 Min. :0.00000 Min. :0.00000 Min. :0.00000
1st Qu.:0.0000 1st Qu.:0.00000 1st Qu.:0.00000 1st Qu.:0.00000
Median :0.0000 Median :0.00000 Median :0.00000 Median :0.00000
Mean :0.3907 Mean :0.02516 Mean :0.09888 Mean :0.04004
3rd Qu.:1.0000 3rd Qu.:0.00000 3rd Qu.:0.00000 3rd Qu.:0.00000
Max. :1.0000 Max. :1.00000 Max. :1.00000 Max. :1.00000
eboden year01 yearc2 yearc3
Min. :0.0000 Min. :0.0000 Min. :3678724 Min. :7.060e+09
1st Qu.:0.0000 1st Qu.:0.0000 1st Qu.:3759721 1st Qu.:7.290e+09
Median :0.0000 Median :0.0000 Median :3841600 Median :7.530e+09
Mean :0.2986 Mean :0.3257 Mean :3828362 Mean :7.493e+09
3rd Qu.:1.0000 3rd Qu.:1.0000 3rd Qu.:3888784 3rd Qu.:7.670e+09
Max. :1.0000 Max. :1.0000 Max. :3996001 Max. :7.990e+09
invarea
Min. :0.004115
1st Qu.:0.012195
Median :0.015152
Mean :0.016904
3rd Qu.:0.019231
Max. :0.050000
And we adjust the model:
\[ \begin{matrix} \widehat{\text{rentsqm}}_{i} = & \beta_{0} + \beta_{1} \text{areainvc}_{i} + \beta_{2} \text{yearco}_{i} + \beta_{3} \text{yearco2}_{i} + \beta_{4} \text{yearco3}_{i} \\ & + \beta_{5} \text{year01}_{i} + \beta_{6} \text{nkitchen}_{i} + \beta_{7} \text{pkitchen}_{i} \\ & + \beta_{8} \text{nkitchen}_{i} \cdot \text{year01}_{i} + \beta_{9} \text{pkitchen}_{i} \cdot \text{year01}_{i} \end{matrix} \]
munich_rent_index_01$areainvc <- munich_rent_index_01$invarea - mean(munich_rent_index_01$invarea)
munich_rent_index_01$yearco <- munich_rent_index_01$yearc - mean(munich_rent_index_01$yearc)
poly_year_01 <- poly(munich_rent_index_01$yearco, 3)
munich_rent_index_01$yearco <- poly_year_01[, 1]
munich_rent_index_01$yearco2 <- poly_year_01[, 2]
munich_rent_index_01$yearco3 <- poly_year_01[, 3]
interaction_model <- lm(
rentsqm_euro ~ areainvc + yearco + yearco2 + yearco3 + year01 + nkitchen + pkitchen + nkitchen * year01 + pkitchen * year01,
data = munich_rent_index_01
)
summary(interaction_model)
Call:
lm(formula = rentsqm_euro ~ areainvc + yearco + yearco2 + yearco3 +
year01 + nkitchen + pkitchen + nkitchen * year01 + pkitchen *
year01, data = munich_rent_index_01)
Residuals:
Min 1Q Median 3Q Max
-7.6303 -1.2647 -0.1072 1.2414 10.5233
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 6.95424 0.03841 181.072 < 2e-16 ***
areainvc 123.71308 4.42630 27.950 < 2e-16 ***
yearco 49.42282 2.02589 24.396 < 2e-16 ***
yearco2 29.45781 2.02386 14.555 < 2e-16 ***
yearco3 -1.03886 1.97176 -0.527 0.598311
year01 -0.25174 0.06678 -3.770 0.000166 ***
nkitchen 0.91567 0.12172 7.523 6.43e-14 ***
pkitchen 1.09081 0.17849 6.111 1.07e-09 ***
year01:nkitchen 0.39826 0.20922 1.904 0.057033 .
year01:pkitchen 0.73511 0.32864 2.237 0.025346 *
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Residual standard error: 1.963 on 4561 degrees of freedom
Multiple R-squared: 0.3397, Adjusted R-squared: 0.3384
F-statistic: 260.7 on 9 and 4561 DF, p-value: < 2.2e-16
The model to adjust ius defined as:
\[ \mathbb{E}(\text{rentsqm}_{i}) = \beta_{0} + \beta_{1} \cdot \text{tlocation} + \beta_{2} \cdot \overline{\frac{1}{\text{area}}} + \beta{3} \cdot \text{area} \cdot \text{tlocation} \]
We use the dataset for average or top location only and change the dummy variable \(\text{tlocation}\) to \(0\) (average) and \(1\) (top). Finally, we visualize the effect as described in the book.
par(mfrow = c(1, 2))
at_rent <- subset(munich_rent_index, location == 1 | location == 3)
at_rent$tlocation[at_rent$location == 1] <- 0
summary(at_rent)
rent rentsqm area yearc location
Min. : 54.92 Min. : 1.417 Min. : 20.00 Min. :1918 1:1794
1st Qu.: 314.62 1st Qu.: 5.276 1st Qu.: 50.00 1st Qu.:1954 2: 0
Median : 418.00 Median : 6.849 Median : 65.00 Median :1962 3: 78
Mean : 444.22 Mean : 7.007 Mean : 66.02 Mean :1959
3rd Qu.: 537.42 3rd Qu.: 8.652 3rd Qu.: 80.00 3rd Qu.:1972
Max. :1459.23 Max. :15.428 Max. :155.00 Max. :1997
bath kitchen cheating district yearco
Mode :logical Mode :logical Mode :logical 623 : 53 Min. :18.00
FALSE:1761 FALSE:1785 FALSE:148 1013 : 37 1st Qu.:54.00
TRUE :111 TRUE :87 TRUE :1724 713 : 32 Median :62.00
1132 : 31 Mean :59.36
1712 : 30 3rd Qu.:72.00
2024 : 30 Max. :97.00
(Other):1659
areainv yearcc yearcc2
Min. :-0.0105206 Min. :-0.030935 Min. :-0.0183373
1st Qu.:-0.0044722 1st Qu.:-0.001862 1st Qu.:-0.0147283
Median :-0.0015876 Median : 0.004598 Median :-0.0067317
Mean : 0.0002152 Mean : 0.002465 Mean :-0.0008428
3rd Qu.: 0.0030278 3rd Qu.: 0.012674 3rd Qu.: 0.0116794
Max. : 0.0330278 Max. : 0.032863 Max. : 0.0537875
yearcc3 glocation tlocation
Min. :-0.0186291 Min. :-1.0000 Min. :0.00000
1st Qu.:-0.0172309 1st Qu.:-1.0000 1st Qu.:0.00000
Median :-0.0056305 Median :-1.0000 Median :0.00000
Mean :-0.0004071 Mean :-0.9583 Mean :0.04167
3rd Qu.: 0.0096124 3rd Qu.:-1.0000 3rd Qu.:0.00000
Max. : 0.0588009 Max. : 0.0000 Max. :1.00000
area_location_model <- lm(
rentsqm ~ tlocation + areainv + area:tlocation,
data = at_rent
)
summary(area_location_model)
Call:
lm(formula = rentsqm ~ tlocation + areainv + area:tlocation,
data = at_rent)
Residuals:
Min 1Q Median 3Q Max
-7.2380 -1.4435 -0.1189 1.4788 7.1162
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 6.911e+00 4.967e-02 139.131 <2e-16 ***
tlocation 7.722e-01 7.280e-01 1.061 0.289
areainv 1.431e+02 7.434e+00 19.252 <2e-16 ***
tlocation:area 1.001e-02 8.612e-03 1.163 0.245
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Residual standard error: 2.101 on 1868 degrees of freedom
Multiple R-squared: 0.1775, Adjusted R-squared: 0.1762
F-statistic: 134.4 on 3 and 1868 DF, p-value: < 2.2e-16
beta_al <- area_location_model$coefficients
f1_area <- function(area){
areainvc <- (1 / area) - mean(1 / area)
beta_al[3] * areainvc
}
f2_area <- function(area){
beta_al[4] * area
}
small_effect <- f1_area(area_seq)
big_effect <- beta_al[2] + f1_area(area_seq) + f2_area(area_seq)
ylim <- c(min(c(small_effect, big_effect)), max(c(small_effect, big_effect)))
plot(
area_seq,
small_effect,
main = 'area effects based on a linear interaction',
xlab = 'area in sqm',
ylab = 'area effects',
ylim = ylim,
type = 'l',
col = 'darkblue',
lwd = 2
)
lines(area_seq, big_effect, col = 'red', lwd = 2)
plot(
area_seq,
beta_al[2] + f2_area(area_seq),
main = 'varying effect of top location',
xlab = 'area in sqm',
ylab = 'Varying effect of top location',
type = 'l',
col = 'darkblue',
lwd = 2
)
Just for exemplify, we define the orthogonal process describe as an R function:
\[ \tilde{x}^{j} = x^{j} - \widetilde{X}_{j} \left ( {\widetilde{X}_{j}}^{T} \widetilde{X}_{j} \right )^{-1} {\widetilde{X}_{j}}^{T} x^{j} \]
orthogonal_design <- function(x){
x_out <- cbind(x[, 1] - mean(x[, 1]), matrix(0, nrow = nrow(x), ncol = ncol(x) - 1))
for (i in 2:ncol(x)){
x_hat <- as.matrix(x_out[, 1:(i-1)])
x_inv <- solve(t(x_hat) %*% x_hat)
x_out[, i] <- x[, i] - x_hat %*% x_inv %*% t(x_hat) %*% x[, i]
}
x_out
}
We test this function on the polynomial of the \(\text{yearc}\) variable:
poly_test <- poly(munich_rent_index$yearc, 3, raw = TRUE)
orthogonal_test <- orthogonal_design(poly_test)
# And check it
c(
orthogonal_test[, 1] %*% orthogonal_test[, 2],
orthogonal_test[, 2] %*% orthogonal_test[, 3],
orthogonal_test[, 1] %*% orthogonal_test[, 3]
)
[1] 7.550556e-05 8.482947e+05 2.602290e+00
Why did it not result?
We compare the models: \(M1\), \(M2\), \(M3\), and \(M4\) using its coefficient of determination \(R^{2}\)
coeff_deter <- data.frame(
`R Squared` = c(summary(m1)$r.squared, summary(m2)$r.squared, summary(m3)$r.squared, summary(m4)$r.squared),
row.names = c("M1", "M2", "M3", "M4")
)
coeff_deter
For the model:
\[ y_{i} = \beta x_{i} + \epsilon_{i} \]
We can verify:
\[ \left ( {X_{n}}^{T} X_{n} \right )^{-1} \to 0 \]
And:
\[ \max_{i=1,...,n} {x_{i}}^{T} \left ( {X_{n}}^{T} X_{n} \right )^{-1} x_{i} \to 0 \text{ for } n \to \infty \]
\[ \left ( {X_{n}}^{T} X_{n} \right )^{-1} = \left ( (1, 2, 3, \cdots, n) \left ( \begin{matrix} 1 \\ 2 \\ \vdots \\ n \end{matrix} \right ) \right )^{-1} = \left ( 1 + 4 + \cdots + n^2 \right )^{-1} \to 0 \]
And:
\[ \max_{i=1,...,n} ( 1, 2, \cdots, n ) \left ( 1 + 4 + \cdots + n^2 \right )^{-1} \left ( \begin{matrix} 1 \\ 2 \\ \vdots \\ n \end{matrix} \right ) \to 0 \text{ for } n \to \infty \]
\[ \left ( {X_{n}}^{T} X_{n} \right )^{-1} = \left ( \left (1, 0.5, 0.33, \cdots, \frac{1}{n} \right ) \left ( \begin{matrix} 1 \\ 0.5 \\ \vdots \\ \frac{1}{n} \end{matrix} \right ) \right )^{-1} = \left ( \sum_{i=1}^{n} \frac{1}{i^{2}} \right )^{-1} \not{\to} 0 \]
And:
\[ \max_{i=1,...,n} \left ( 1, 0.5, \cdots, \frac{1}{n} \right ) \left ( \sum_{i = 1}^{n} \frac{1}{i^{2}} \right )^{-1} \left ( \begin{matrix} 1 \\ 0.5 \\ \vdots \\ \frac{1}{n} \end{matrix} \right ) \not{\to} 0 \text{ for } n \to \infty \]
\[ \left ( {X_{n}}^{T} X_{n} \right )^{-1} = \left ( \sum_{i = 1}^{n} \frac{1}{i} \right )^{-1} \to 0 \]
And:
\[ \max_{i=1,...,n} \left ( 1, \frac{1}{\sqrt{2}}, \cdots, \frac{1}{\sqrt{n}} \right ) \left ( \sum_{i = 1}^{n} \frac{1}{i} \right )^{-1} \begin{matrix} 1 \\ \frac{1}{\sqrt{2}} \\ \vdots \\ \frac{1}{\sqrt{n}} \end{matrix} \to 0 \]
The regression model to adjust is:
\[ \begin{matrix} \text{rentsqm}_{i} = & \beta_{0} + \beta_{1} \text{areainvc}_{i} + \beta_{2} \text{yearco}_{i} + \beta_{3} \text{yearco2}_{i} + \beta_{4} \text{yearco3}_{i} \\ & + \beta_{5} \text{nkitchen}_{i} + \beta_{6} \text{pkitchen}_{i} + \beta_{7} \text{year01}_{i} + \epsilon_{i} \end{matrix} \]
hypothesis_model <- lm(
rentsqm_euro ~ areainvc + yearco + yearco2 + yearco3 + nkitchen + pkitchen + year01,
data = munich_rent_index_01
)
summary(hypothesis_model)
Call:
lm(formula = rentsqm_euro ~ areainvc + yearco + yearco2 + yearco3 +
nkitchen + pkitchen + year01, data = munich_rent_index_01)
Residuals:
Min 1Q Median 3Q Max
-7.674 -1.270 -0.113 1.242 10.479
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 6.93239 0.03757 184.539 < 2e-16 ***
areainvc 123.76987 4.42908 27.945 < 2e-16 ***
yearco 49.37274 2.02573 24.373 < 2e-16 ***
yearco2 29.49985 2.02515 14.567 < 2e-16 ***
yearco3 -0.88418 1.97229 -0.448 0.65396
nkitchen 1.04340 0.10151 10.279 < 2e-16 ***
pkitchen 1.30205 0.15226 8.552 < 2e-16 ***
year01 -0.18524 0.06213 -2.981 0.00288 **
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Residual standard error: 1.964 on 4563 degrees of freedom
Multiple R-squared: 0.3385, Adjusted R-squared: 0.3375
F-statistic: 333.6 on 7 and 4563 DF, p-value: < 2.2e-16
So we want to test the hypothesis:
\[ \begin{matrix} H_{0} : \beta_{7} = 0 & \text{against} & H_{1} : \beta_{7} \neq 0 \end{matrix} \]
About the significance of the follow-up measure.
\[ \begin{matrix} H_{0} : \left ( \begin{matrix} \beta_{5} \\ \beta_{6} \end{matrix} \right ) = \left ( \begin{matrix} 0 \\ 0 \end{matrix} \right ) & \text{against} & H_{1} : \left ( \begin{matrix} \beta_{5} \\ \beta_{6} \end{matrix} \right ) \neq \left ( \begin{matrix} 0 \\ 0 \end{matrix} \right ) \end{matrix} \]
About the significance of the kitchen variable. And a final test about the significance of differentiate between these two type of kitchens:
\[ \begin{matrix} H_{0} : \beta_{5} - \beta_{6} = 0 & \text{against} & H_{1} : \beta_{5} - \beta_{6} \neq 0 \end{matrix} \]
For the first test (\(H_{0} : \beta_{7} = 0\)), we calculate:
\[ t_{7} = \frac{\hat{\beta}_{7}}{{\widehat{\text{Var}(\hat{\beta}_{7})}}^{1/2}} \sim t_{1 - \alpha / 2} (n - p) \]
The covariance matrix is:
cov_matrix <- vcov(hypothesis_model)
cov_matrix
(Intercept) areainvc yearco yearco2 yearco3
(Intercept) 0.001411211 0.006823735 0.004991278 0.005173670 -0.002333528
areainvc 0.006823735 19.616785356 -1.294964483 1.420604414 0.122987111
yearco 0.004991278 -1.294964483 4.103591466 0.046451741 -0.006404590
yearco2 0.005173670 1.420604414 0.046451741 4.101229176 0.027782090
yearco3 -0.002333528 0.122987111 -0.006404590 0.027782090 3.889943976
nkitchen -0.001093652 -0.091944737 -0.025946481 -0.026748293 0.006345756
pkitchen -0.001128930 0.022989833 -0.042249319 -0.049085309 -0.016014007
year01 -0.001270640 0.004137400 -0.002253660 -0.001730023 0.007205398
nkitchen pkitchen year01
(Intercept) -1.093652e-03 -0.0011289298 -1.270640e-03
areainvc -9.194474e-02 0.0229898326 4.137400e-03
yearco -2.594648e-02 -0.0422493193 -2.253660e-03
yearco2 -2.674829e-02 -0.0490853093 -1.730023e-03
yearco3 6.345756e-03 -0.0160140070 7.205398e-03
nkitchen 1.030419e-02 0.0014042193 5.682974e-05
pkitchen 1.404219e-03 0.0231824417 1.902243e-04
year01 5.682974e-05 0.0001902243 3.860040e-03
We check the \(\text{Var}(\hat{\beta}_{7})\) is the last value.
var_7 <- cov_matrix[8, 8]
t_7 <- hypothesis_model$coefficients[8] / sqrt(var_7)
t_7
year01
-2.981496
And this coincides with the value given in the
summary.
For the second test, we compute the \(F\) value as:
\[ F = \frac{1}{r} {\hat{\beta}}^{T} \widehat{\text{Cov} \left ( \hat{\beta} \right )}^{-1} \hat{\beta} \sim F_{1, n - p} \]
With the \(\hat{\beta}\) define in the test \(\left ( \begin{matrix} \hat{\beta_{5}} \\ \hat{\beta_{6}} \end{matrix} \right )\), then \(r=2\) and \(n - p = n - 8\):
df <- nrow(munich_rent_index_01) - 8
r <- 2
beta_hat <- as.matrix(hypothesis_model$coefficients[6:7])
f_56 <- (1/2) * t(beta_hat) %*% solve(cov_matrix[6:7, 6:7]) %*% beta_hat
f_56
[,1]
[1,] 82.08307
We get expected \(F\):
f_crit <- qf(p = 0.95, r, df)
f_crit
[1] 2.9977
And we reject the null hypothesis.
The third and final test comparing \(\beta_{5}\) and \(\beta_{6}\) between each other. We need:
\[ \widehat{\text{Var}(\hat{\beta}_{5} - \hat{\beta}_{6})} = \widehat{\text{Var}(\hat{\beta}_{5})} + \widehat{\text{Var}(\hat{\beta}_{6})} - 2 \cdot \widehat{\text{Cov}(\hat{\beta}_{5}, \hat{\beta}_{6})} \]
Using the \(\widehat{\text{Cov}(\hat{\beta})}\) matrix:
cov_5_6 <- cov_matrix[6, 7]
var_5 <- cov_matrix[6, 6]
var_6 <- cov_matrix[7, 7]
var_5_6 <- var_5 + var_6 - 2 * cov_5_6
f_5_6 <- (hypothesis_model$coefficients[6] - hypothesis_model$coefficients[7])^2 / var_5_6
f_5_6
nkitchen
2.18071
And this \(F\) critical, using \(r=1\):
f_crit <- qf(p = 0.95, 1, df)
f_crit
[1] 3.843498
So, in this case, we do not reject the null hypothesis.
The confidence interval for \(\beta_{7}\) is obtain using:
\[ \beta_{7} \pm t_{n - p} \left (1 - \alpha / 2 \right ) \cdot \widehat{\text{Var}(\hat{\beta}_{7})}^{1/2} \]
inter <- qt(1 - 0.05 / 2, df) * sqrt(var_7)
c(hypothesis_model$coefficients[8] - inter, hypothesis_model$coefficients[8] + inter)
year01 year01
-0.3070414 -0.0634347
Using the prediction interval:
\[ {x_{0}}^{T} \cdot \hat{\beta} \pm t_{n-p}(1 - \alpha / 2) \hat{\sigma} \left ( 1 + {x_{0}}^{T} \left (X^{T} X \right )^{1} x_{0} \right )^{1/2} \]
We can obtain the \(\hat{\sigma}\) directly from the model or using:
\[ (n - p) \hat{\sigma}^2 = \epsilon^{T} \epsilon \]
sigma_model <- summary(hypothesis_model)$sigma
sigma_2 <- t(as.matrix(hypothesis_model$residuals)) %*% as.matrix(hypothesis_model$residuals) / (nrow(munich_rent_index_01) - 8)
sigma <- sqrt(sigma_2)
c(sigma_model, sigma)
[1] 1.964112 1.964112
We are also going to use the design matrix:
design_matrix <- as.matrix(hypothesis_model$model)
design_matrix[, 1] <- 1
colnames(design_matrix)[1] <- "1"
head(design_matrix)
1 areainvc yearco yearco2 yearco3 nkitchen pkitchen
1 1 0.0116672025 -0.011568466 -0.010460264 0.030118199 0 0
2 1 -0.0072887975 -0.011568466 -0.010460264 0.030118199 0 0
3 1 0.0175786025 0.009594692 -0.004320479 -0.013940551 0 0
4 1 0.0087368025 0.010256041 -0.003177207 -0.014569233 0 0
5 1 -0.0065948975 0.018853574 0.016932467 -0.005009151 0 0
6 1 -0.0007751975 0.003642554 -0.012015191 -0.002877678 0 0
year01
1 0
2 0
3 0
4 0
5 0
6 0
area_seq <- seq(min(munich_rent_index_01$area), max(munich_rent_index_01$area), length.out = 100)
areainvc <- (1 / area_seq) - mean(1 / area_seq)
betam <- hypothesis_model$coefficients
yearcc <- munich_rent_index_01[munich_rent_index_01$yearc == 1918, ][1, ]
# Three last terms were added for completeness
constant <- betam[1] + betam[3] * yearcc$yearco + betam[4] * yearcc$yearco2 + betam[5] * yearcc$yearco3 + betam[6] * 0 + betam[7] * 0 + betam[8] * 0
plot(
munich_rent_index_01$area,
munich_rent_index_01$rentsqm_euro,
xlab = 'area in sqm',
ylab = 'estimated rent per sqm'
)
beta_1 <- hypothesis_model$coefficients[2]
lines(area_seq, constant + beta_1 * areainvc, col = 'red', lwd = 2)
conf_inter <- qt(1 - 0.05 / 2, df) * sqrt(cov_matrix[2, 2])
beta_conf <- c(beta_1 - conf_inter, beta_1 + conf_inter)
lines(area_seq, constant + beta_conf[1] * areainvc, col = 'darkblue', lwd = 2)
lines(area_seq, constant + beta_conf[2] * areainvc, col = 'darkblue', lwd = 2)
x_o <- as.matrix(cbind(
1,
areainvc,
yearcc$yearco,
yearcc$yearco2,
yearcc$yearco3,
0,
0,
0
))
pred_inter <- NULL
for (i in seq_len(nrow(x_o)))
x_o_i <- as.matrix(x_o[i, ])
pred_inter <- c(pred_inter, qt(1 - 0.05 / 2, df) * sigma * sqrt(
1 + (
t(x_o_i) %*% solve(t(design_matrix)%*%design_matrix) %*% x_o_i
))
)
lines(area_seq, constant + beta_conf[1] * areainvc - pred_inter, col = 'gray', lwd = 2)
lines(area_seq, constant + beta_conf[2] * areainvc + pred_inter, col = 'gray', lwd = 2)
This can also be done using predict:
x_o <- as.data.frame(cbind(
areainvc,
yearcc$yearco,
yearcc$yearco2,
yearcc$yearco3,
0,
0,
0
))
colnames(x_o) <- colnames(hypothesis_model$model)[2:8]
confidence_interval <- predict(hypothesis_model, x_o, interval = "confidence")
plot(
munich_rent_index_01$area,
munich_rent_index_01$rentsqm_euro,
xlab = 'area in sqm',
ylab = 'estimated rent per sqm'
)
lines(area_seq, confidence_interval[, "fit"], col = 'red', lwd = 2)
lines(area_seq, confidence_interval[, "lwr"], col = 'darkblue', lwd = 2)
lines(area_seq, confidence_interval[, "upr"], col = 'darkblue', lwd = 2)
prediction_interval <- predict(hypothesis_model, x_o, interval = "prediction")
lines(area_seq, prediction_interval[, "lwr"], col = 'gray', lwd = 2)
lines(area_seq, prediction_interval[, "upr"], col = 'gray', lwd = 2)